// Copyright 2021 ByteDance Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by command: go run gen.go -avx2 -out ./avx2.s. DO NOT EDIT.

#include "textflag.h"

// XXH_PRIME32_1 (0x9e3779b1), stored zero-extended to a full quadword so the
// 8-byte VPBROADCASTQ load in accumAVX2 stays in bounds (the original 4-byte
// symbol made that broadcast read 4 bytes past the end). Only the low dword
// of each lane is consumed (VPMULUDQ), so the zeroed upper half is inert and
// the hash output is unchanged.
DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
GLOBL prime_avx<>(SB), RODATA|NOPTR, $8

// func accumAVX2(acc *[8]uint64, xinput *byte, xsecret *byte, len uint64)
// Requires: AVX, AVX2
//
// XXH3 accumulation kernel. The 8 uint64 accumulators live in Y1 (acc[0..3])
// and Y2 (acc[4..7]) for the whole call. Input is consumed in 1024-byte
// blocks of 16 64-byte stripes; stripe n of a block mixes with the secret at
// byte offset 8*n (half-stripes at 8*n and 8*n+32). After each full block the
// accumulators are scrambled with the secret tail at offsets 128/160. Leftover
// whole stripes use a second secret cursor (BX) advancing 8 bytes per stripe,
// and a final partial stripe re-reads the LAST 64 input bytes against
// secret+121/+153 (secretLimit-7, per the XXH3 spec).
//
// Per 32-byte half-stripe, with d = input, k = d ^ secret:
//   acc += swap64(d)  +  (k & 0xffffffff) * (k >> 32)
//
// Registers: AX=acc, CX=input cursor, DX=secret base (block loop),
// BX=secret cursor (stripe loop), SI=bytes remaining.
//
// NOTE(review): VZEROUPPER added before RET relative to the raw generator
// output to avoid AVX->SSE transition penalties in surrounding compiled code;
// fold this into gen.go when regenerating.
TEXT ·accumAVX2(SB), NOSPLIT, $0-32
	MOVQ         acc+0(FP), AX
	MOVQ         xinput+8(FP), CX
	MOVQ         xsecret+16(FP), DX
	MOVQ         xsecret+16(FP), BX
	MOVQ         len+24(FP), SI
	VMOVDQU      (AX), Y1
	VMOVDQU      32(AX), Y2
	VPBROADCASTQ prime_avx<>+0(SB), Y0 // PRIME32_1 in the low dword of every lane

// One full 1024-byte block per iteration while len > 1024.
accumBlock:
	CMPQ     SI, $0x00000400
	JLE      accumStripe

	// stripes 0-1: input+0..127, secret+0 / secret+8
	VMOVDQU  (CX), Y3
	VMOVDQU  (DX), Y4
	VMOVDQU  32(CX), Y6
	VMOVDQU  32(DX), Y7
	VMOVDQU  64(CX), Y8
	VMOVDQU  8(DX), Y9
	VMOVDQU  96(CX), Y10
	VMOVDQU  40(DX), Y11
	VPXOR    Y3, Y4, Y4              // k = d ^ secret
	VPSRLQ   $0x20, Y4, Y5           // k >> 32
	VPSHUFD  $0x4e, Y3, Y3           // swap64: exchange qword halves of d
	VPMULUDQ Y4, Y5, Y5              // lo32(k) * hi32(k)
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 2-3: input+128..255, secret+16 / secret+24
	VMOVDQU  128(CX), Y3
	VMOVDQU  16(DX), Y4
	VMOVDQU  160(CX), Y6
	VMOVDQU  48(DX), Y7
	VMOVDQU  192(CX), Y8
	VMOVDQU  24(DX), Y9
	VMOVDQU  224(CX), Y10
	VMOVDQU  56(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 4-5: input+256..383, secret+32 / secret+40
	VMOVDQU  256(CX), Y3
	VMOVDQU  32(DX), Y4
	VMOVDQU  288(CX), Y6
	VMOVDQU  64(DX), Y7
	VMOVDQU  320(CX), Y8
	VMOVDQU  40(DX), Y9
	VMOVDQU  352(CX), Y10
	VMOVDQU  72(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 6-7: input+384..511, secret+48 / secret+56
	VMOVDQU  384(CX), Y3
	VMOVDQU  48(DX), Y4
	VMOVDQU  416(CX), Y6
	VMOVDQU  80(DX), Y7
	VMOVDQU  448(CX), Y8
	VMOVDQU  56(DX), Y9
	VMOVDQU  480(CX), Y10
	VMOVDQU  88(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 8-9: input+512..639, secret+64 / secret+72
	VMOVDQU  512(CX), Y3
	VMOVDQU  64(DX), Y4
	VMOVDQU  544(CX), Y6
	VMOVDQU  96(DX), Y7
	VMOVDQU  576(CX), Y8
	VMOVDQU  72(DX), Y9
	VMOVDQU  608(CX), Y10
	VMOVDQU  104(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 10-11: input+640..767, secret+80 / secret+88
	VMOVDQU  640(CX), Y3
	VMOVDQU  80(DX), Y4
	VMOVDQU  672(CX), Y6
	VMOVDQU  112(DX), Y7
	VMOVDQU  704(CX), Y8
	VMOVDQU  88(DX), Y9
	VMOVDQU  736(CX), Y10
	VMOVDQU  120(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 12-13: input+768..895, secret+96 / secret+104
	VMOVDQU  768(CX), Y3
	VMOVDQU  96(DX), Y4
	VMOVDQU  800(CX), Y6
	VMOVDQU  128(DX), Y7
	VMOVDQU  832(CX), Y8
	VMOVDQU  104(DX), Y9
	VMOVDQU  864(CX), Y10
	VMOVDQU  136(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// stripes 14-15: input+896..1023, secret+112 / secret+120
	VMOVDQU  896(CX), Y3
	VMOVDQU  112(DX), Y4
	VMOVDQU  928(CX), Y6
	VMOVDQU  144(DX), Y7
	VMOVDQU  960(CX), Y8
	VMOVDQU  120(DX), Y9
	VMOVDQU  992(CX), Y10
	VMOVDQU  152(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	ADDQ     $0x00000400, CX
	SUBQ     $0x00000400, SI

	// Scramble after each full block:
	//   acc = (acc ^ (acc >> 47) ^ secret[128..191]) * PRIME32_1
	// The 64x32 multiply is built from two VPMULUDQ halves recombined via
	// shift-and-add.
	VPSRLQ   $0x2f, Y1, Y3
	VPXOR    Y1, Y3, Y3
	VPXOR    128(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4              // lo32(x) * prime
	VPSRLQ   $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3              // hi32(x) * prime
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y4, Y3, Y1
	VPSRLQ   $0x2f, Y2, Y3
	VPXOR    Y2, Y3, Y3
	VPXOR    160(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4
	VPSRLQ   $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y4, Y3, Y2
	JMP      accumBlock

// One 64-byte stripe per iteration while len > 64; BX (secret cursor)
// advances 8 bytes per stripe. Y0 is dead here, so it is reused as scratch.
accumStripe:
	CMPQ     SI, $0x40
	JLE      accumLastStripe
	VMOVDQU  (CX), Y0
	VMOVDQU  (BX), Y3
	VMOVDQU  32(CX), Y5
	VMOVDQU  32(BX), Y6
	VPXOR    Y0, Y3, Y3
	VPSRLQ   $0x20, Y3, Y4
	VPSHUFD  $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ   Y1, Y0, Y1
	VPADDQ   Y1, Y4, Y1
	VPXOR    Y5, Y6, Y6
	VPSRLQ   $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD  $0x4e, Y5, Y5
	VPADDQ   Y2, Y5, Y2
	VPADDQ   Y2, Y4, Y2
	ADDQ     $0x00000040, CX
	SUBQ     $0x00000040, SI
	ADDQ     $0x00000008, BX
	JMP      accumStripe

// Final partial stripe (0 < len <= 64): rewind so the stripe covers the LAST
// 64 bytes of input (overlapping the previous stripe), mixed with the secret
// at offset 121 (= secretLimit - 7, per XXH3).
accumLastStripe:
	CMPQ     SI, $0x00
	JE       return
	SUBQ     $0x40, CX
	ADDQ     SI, CX                  // CX = input_end - 64
	VMOVDQU  (CX), Y0
	VMOVDQU  121(DX), Y3
	VMOVDQU  32(CX), Y5
	VMOVDQU  153(DX), Y6
	VPXOR    Y0, Y3, Y3
	VPSRLQ   $0x20, Y3, Y4
	VPSHUFD  $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ   Y1, Y0, Y1
	VPADDQ   Y1, Y4, Y1
	VPXOR    Y5, Y6, Y6
	VPSRLQ   $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD  $0x4e, Y5, Y5
	VPADDQ   Y2, Y5, Y2
	VPADDQ   Y2, Y4, Y2

return:
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VZEROUPPER                       // clear dirty upper YMM state before returning to SSE code
	RET
